In [1]:
%pylab
%matplotlib inline


Using matplotlib backend: agg
Populating the interactive namespace from numpy and matplotlib

In [2]:
cd ..


/home/scott/Documents/git/neukrill-net-work

In [7]:
import sys

import numpy as np
import skimage
import cv2
import sklearn
# Explicit submodule imports: a bare `import sklearn` does not guarantee
# that sklearn.preprocessing / sklearn.decomposition / etc. are reachable
# as attributes, yet later cells rely on exactly that access pattern.
import sklearn.preprocessing
import sklearn.decomposition
import sklearn.cross_validation
import sklearn.linear_model
import sklearn.metrics

In [4]:
from IPython.display import display
from IPython.display import Image
from IPython.display import HTML

In [5]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

In [6]:
import neukrill_net.utils as utils
import neukrill_net.image_processing as image_processing
import neukrill_net.bagofwords as bagofwords

In [8]:
# Load the project settings (image filename manifest and class list)
# from settings.json in the working directory set by the `cd ..` above.
settings = utils.Settings('settings.json')

In [9]:
# Load every training image along with its class label, grouped by the
# class list from the settings manifest.
rawdata, labels = utils.load_rawdata(settings.image_fnames, classes=settings.classes)

In [10]:
# Encode the string class labels as integer targets for sklearn.
# label_encoder is kept around so predictions could be mapped back to
# class names later (e.g. for a submission file).
label_encoder = sklearn.preprocessing.LabelEncoder()
y = label_encoder.fit_transform(labels)

In [11]:
# Bag-of-words hyperparameters: 15x15 pixel patches, a kmeans vocabulary
# of 20 visual words, and a fixed seed for reproducible clustering.
bow_options = {
    'verbose': True,
    'n_features_max': 100,
    'patch_size': 15,
    'clusteralgo': 'kmeans',
    'n_clusters': 20,
    'random_seed': 42,
}

In [23]:
# Bag-of-words encoder; histograms are left unnormalised so each bin is
# a raw keypoint count (see the integer-valued rows in Out[28] below).
bow = bagofwords.Bow(normalise_hist=False, **bow_options)

In [13]:
# Draw 1000 training-image indices uniformly (with replacement).
# np.random.random_integers is deprecated (removed in NumPy 1.17);
# randint's upper bound is exclusive, so [0, len(rawdata)) reproduces
# the old inclusive [0, len(rawdata)-1] range exactly.
sample = np.random.randint(0, len(rawdata), size=1000)

In [24]:
# Build the visual-word vocabulary by clustering patch descriptors from
# the 1000-image sample (kmeans with 20 clusters, per bow_options).
bow.build_vocabulary([rawdata[i] for i in sample])


Describing the keypoints of 1000 images
Clustering patch descriptors to form vocabulary

Visualising vocabulary clustering of keypoints

Setup


In [37]:
# Collect the keypoint descriptors of every sampled image, drop images
# that yielded no keypoints (describeImage returns None for those), and
# stack the rest into one descriptor-per-row matrix.
descriptions = (bow.describeImage(rawdata[i]) for i in sample)
dscdata = np.vstack([d for d in descriptions if d is not None])

# Assign each descriptor to its nearest vocabulary cluster.
dscclass = bow.cluster.predict(dscdata)

PCA


In [55]:
# Project the descriptor cloud onto its first three principal components.
pca = sklearn.decomposition.PCA(n_components=3)
reduced_data = pca.fit_transform(dscdata)

In [56]:
# Plot each pair of the 3 components, coloured by vocabulary cluster.
for a, b in ((0, 1), (0, 2), (1, 2)):
    plt.scatter(reduced_data[:,a], reduced_data[:,b], c=dscclass, alpha=0.5, linewidths=0, s=1)
    plt.show()


ICA


In [53]:
# FastICA projection to 3 components. FastICA is stochastic, so fix
# random_state to make the visualisation reproducible across re-runs,
# matching the random_seed=42 convention used for the BoW vocabulary.
reduced_data = sklearn.decomposition.FastICA(n_components=3, random_state=42).fit_transform(dscdata)


/home/scott/Documents/git/neukrill-venv3/lib/python3.4/site-packages/sklearn/decomposition/fastica_.py:282: DeprecationWarning: Implicitly casting between incompatible kinds. In a future numpy release, this will raise an error. Use casting="unsafe" if this is intentional.
  X -= X_mean[:, np.newaxis]

In [54]:
# Plot each pair of the 3 independent components, coloured by cluster.
for a, b in ((0, 1), (0, 2), (1, 2)):
    plt.scatter(reduced_data[:,a], reduced_data[:,b], c=dscclass, alpha=0.5, linewidths=0, s=1)
    plt.show()


Factor Analysis


In [57]:
# Factor-analysis projection of the descriptors onto 3 latent factors.
fa = sklearn.decomposition.FactorAnalysis(n_components=3)
reduced_data = fa.fit_transform(dscdata)

In [58]:
# Plot each pair of the 3 latent factors, coloured by cluster.
for a, b in ((0, 1), (0, 2), (1, 2)):
    plt.scatter(reduced_data[:,a], reduced_data[:,b], c=dscclass, alpha=0.5, linewidths=0, s=1)
    plt.show()


Training with Logistic Regression


In [27]:
# Encode every training image as a bag-of-words histogram and stack
# them into the design matrix X (one row per image, one column per
# visual word).
# NOTE(review): unlike the describeImage loop above, no None-filtering
# happens here — assumes compute_image_bow always returns a histogram;
# confirm against the Bow implementation.
X = np.vstack([bow.compute_image_bow(img) for img in rawdata])

In [16]:
# Stratified shuffle-split cross-validation (the old pre-0.18 API takes
# the label vector directly; default 10 iterations — matching the 10
# log-loss values printed below).
# NOTE(review): sklearn.cross_validation is deprecated in favour of
# sklearn.model_selection — migrate when upgrading scikit-learn.
cv = sklearn.cross_validation.StratifiedShuffleSplit(y)

In [17]:
# Logistic regression classifier with default hyperparameters.
clf = sklearn.linear_model.LogisticRegression()

In [28]:
# Sanity check: the first 10 BoW histograms (20 bins, matching n_clusters).
X[:10]


Out[28]:
array([[  1.,   0.,   3.,   1.,   7.,   0.,   0.,   0.,   0.,   0.,   2.,
          0.,   2.,   0.,   1.,   4.,   1.,   0.,   2.,   0.],
       [  2.,   0.,   2.,   3.,  11.,   3.,   1.,   3.,   2.,   0.,   6.,
          0.,   5.,   1.,   2.,   6.,   1.,   1.,   4.,   6.],
       [  1.,   0.,   1.,   1.,   9.,   0.,   1.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,  11.,   0.,   0.,   3.,   0.],
       [  1.,   0.,   2.,   0.,   1.,   0.,   0.,   0.,   2.,   0.,   2.,
          0.,   0.,   2.,   5.,  10.,   6.,   0.,   3.,   2.],
       [  2.,   0.,   0.,   3.,   5.,   4.,   0.,   3.,   0.,   0.,   7.,
          0.,   1.,   1.,   1.,  15.,   1.,   0.,   1.,   0.],
       [  1.,   0.,   0.,   0.,   1.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   1.,   0.,   0.,   5.,   2.,   0.,   0.,   0.],
       [  9.,   0.,   0.,   0.,   7.,   0.,   0.,   9.,   0.,   0.,   1.,
          0.,   0.,   2.,   1.,   5.,   0.,   0.,   3.,   1.],
       [  8.,   0.,   0.,   1.,   7.,   0.,   0.,   3.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   3.,   1.,   0.,   6.,   1.],
       [  0.,   0.,   2.,   1.,   6.,   1.,   0.,   0.,   0.,   0.,   0.,
          0.,   2.,   0.,   0.,  10.,   1.,   0.,   7.,   1.],
       [  4.,   0.,   3.,   3.,   5.,   3.,   2.,   5.,   0.,   0.,   4.,
          0.,   0.,   0.,   0.,   5.,   0.,   0.,   1.,   0.]])

In [26]:
# Histogram of a single image; matches the first row of X shown above.
bow.compute_image_bow(rawdata[0])


Out[26]:
array([1, 0, 3, 1, 7, 0, 0, 0, 0, 0, 2, 0, 2, 0, 1, 4, 1, 0, 2, 0])

In [29]:
# Cross-validate the logistic regression on the BoW features,
# collecting the multiclass log-loss of each fold.
# NOTE(review): the vocabulary (and hence X) was built from images that
# also appear in the held-out folds; the commented-out lines below would
# rebuild the BoW per training fold to avoid that leakage — confirm
# whether the reported log-loss is optimistic because of this.
print('Cross-validating')
results = []
for train, test in cv:
    # Make a new BOW encoding
    #bow = bagofwords.Bow(**bow_options)
    #bow.build_vocabulary([rawdata[i] for i in train])
    #X = [bow.compute_image_bow(img) for img in rawdata]

    clf.fit(X[train], y[train])
    p = clf.predict_proba(X[test])
    res = sklearn.metrics.log_loss(y[test], p)
    print(res)
    results.append(res)


Cross-validating
2.4409929416
2.46779735961
2.41937293395
2.47116204372
2.45700342345
2.44456435296
2.46251010289
2.4428022459
2.43156976094
2.45793841621

Try to predict classes of test data


In [30]:
# Refit on the full training set before scoring the test images.
print('Fitting clf to all training data')
clf.fit(X,y)


Fitting clf to all training data
Loading the raw test data
Bagging words for raw test data

In [ ]:
# NOTE(review): assumes load_rawdata without a `classes` argument yields
# the test images plus their filenames — confirm against utils;
# `names` would be needed to label rows of any submission built from p.
print('Loading the raw test data')
rawtest, names = utils.load_rawdata(settings.image_fnames)

In [34]:
# Encode the test images with the training vocabulary and score them.
# NOTE(review): Out[33] shows X2 has only 30336 rows while Out[32]
# reports 130400 test files — rawtest appears incomplete or filtered;
# verify before using p for a submission.
print('Bagging words for raw test data')
X2 = [bow.compute_image_bow(img) for img in rawtest]
X2 = np.vstack(X2)
p = clf.predict_proba(X2)


Bagging words for raw test data

In [32]:
# Expected number of test images according to the settings manifest.
len(settings.image_fnames['test'])


Out[32]:
130400

In [33]:
# Row count here should match the manifest count above (it does not —
# see the NOTE on the cell that built X2).
X2.shape


Out[33]:
(30336, 20)

Visualise Logistic Regression classes